{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import datetime\n",
    "from sklearn.decomposition import PCA\n",
    "import matplotlib.pylab as plt\n",
    "import matplotlib as mpl\n",
    "import numpy as np\n",
    "from sklearn.cluster import KMeans\n",
    "import datetime\n",
    "from functools import partial\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import r2_score\n",
    "import statsmodels.api as sm\n",
    "from scipy import stats\n",
    "import numpy as np\n",
    "from sklearn import preprocessing\n",
    "\n",
    "def write_to_csv(est2, filename):\n",
    "    with open(filename, \"wb\") as file:\n",
    "        file.write(est2.summary().as_csv())\n",
    "    file.close()\n",
    "\n",
    "\n",
    "def add_days(x, days = 1):\n",
    "    return x + datetime.timedelta(days)\n",
    "\n",
    "\n",
    "plt.style.use('fivethirtyeight')\n",
    "\n",
    "mpl.rcParams['figure.figsize'] = [22.0, 8.0]\n",
    "mpl.rcParams['axes.facecolor'] = 'FFFFFF' # Or any suitable colour...\n",
    "mpl.rcParams[\"axes.titlesize\"] =  \"large\"   # fontsize of the axes title\n",
    "mpl.rcParams[\"axes.labelsize\"] = \"large\"  # fontsize of the x any y labels\n",
    "mpl.rcParams[\"xtick.labelsize\"] = \"large\"\n",
    "mpl.rcParams[\"ytick.labelsize\"] = \"large\"\n",
    "fontsize =40\n",
    "label_fontsize = 25"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-23-a0dec864f441>:17: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  cases_daily.daily_cases[cases_daily.daily_cases <0] = 0\n"
     ]
    }
   ],
   "source": [
    "### get mobility data\n",
    "mobility_data = pd.read_csv(\"./data/mobility/2020_US_Region_Mobility_Report_11_12-2.csv\")\n",
    "mobility_data[\"date\"] = pd.to_datetime(mobility_data.date)\n",
    "\n",
    "\n",
    "mobility_data = mobility_data.dropna(subset=[\"census_fips_code\"])\n",
    "\n",
    "### get case data from NYT\n",
    "county_level_case_data = pd.read_csv(\"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv\")\n",
    "#https://raw.githubusercontent.com/nytimes/covid-19-data/master/live/us-counties.csv\n",
    "county_level_case_data[\"date\"] = pd.to_datetime(county_level_case_data.date)\n",
    "county_level_case_data = county_level_case_data.dropna(subset=[\"fips\"])\n",
    "\n",
    "cum_data_per_fips = county_level_case_data[[\"date\", \"fips\", \"cases\"]].pivot( index=\"date\", columns=\"fips\", values=[\"cases\"]).fillna(0.0)\n",
    "cases_daily = pd.melt(cum_data_per_fips.diff().reset_index(), id_vars=[\"date\"])\n",
    "cases_daily.columns =[u'date', None, u'fips', u'daily_cases']\n",
    "cases_daily.daily_cases[cases_daily.daily_cases <0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-24-bea571865497>:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  deaths_daily.daily_deaths[deaths_daily.daily_deaths <0] = 0\n"
     ]
    }
   ],
   "source": [
    "death_cum_data_per_fips = county_level_case_data[[\"date\", \"fips\", \"deaths\"]].pivot( index=\"date\", columns=\"fips\", values=[\"deaths\"]).fillna(0.0)\n",
    "deaths_daily = pd.melt(death_cum_data_per_fips.diff().reset_index(), id_vars=[\"date\"])\n",
    "deaths_daily.columns =[u'date', None, u'fips', u'daily_deaths']\n",
    "deaths_daily.daily_deaths[deaths_daily.daily_deaths <0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "hospitalizations_cum_data_per_fips = county_level_case_data[[\"date\", \"fips\", \"deaths\"]].pivot( index=\"date\", columns=\"fips\", values=[\"deaths\"]).fillna(0.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "county_level_case_data = pd.merge(pd.merge(county_level_case_data, cases_daily[[\"date\", \"fips\", \"daily_cases\"]], on=[\"date\", \"fips\"]), deaths_daily[[\"date\", \"fips\", \"daily_deaths\"]], on = [\"date\", \"fips\"])\n",
    "all_data= pd.merge(county_level_case_data, mobility_data,  right_on =[\"date\", \"census_fips_code\"], left_on=[\"date\", \"fips\"], how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Fips code</th>\n",
       "      <th>POPESTIMATE2019</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1003.0</td>\n",
       "      <td>223234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1073.0</td>\n",
       "      <td>658573</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1081.0</td>\n",
       "      <td>164542</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1089.0</td>\n",
       "      <td>372909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1097.0</td>\n",
       "      <td>413210</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Fips code  POPESTIMATE2019\n",
       "0     1003.0           223234\n",
       "1     1073.0           658573\n",
       "2     1081.0           164542\n",
       "3     1089.0           372909\n",
       "4     1097.0           413210"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fips_code_county_population.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "fips_code_county_population  = pd.read_csv(\"./data/burbio/county_population_csv\")\n",
    "\n",
    "all_data = pd.merge(all_data, fips_code_county_population, left_on=\"fips\", right_on=\"Fips code\")\n",
    "\n",
    "all_data[\"cases_per_pop\"] = all_data[\"daily_cases\"]*1000.0/all_data[\"POPESTIMATE2019\"]\n",
    "all_data[\"deaths_per_pop\"] = all_data[\"daily_deaths\"]*1000.0/all_data[\"POPESTIMATE2019\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/hoktay/.pyenv/versions/3.8.0/envs/agent_based_seir/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3146: DtypeWarning: Columns (2,3) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
     ]
    }
   ],
   "source": [
    "### Add state-wide R_0 data\n",
    "\n",
    "state_ro_data = pd.read_csv(\"https://d14wlfuexuxgcm.cloudfront.net/covid/rt.csv\")\n",
    "state_name_code_map = pd.read_csv(\"data/state_name_code_map.csv\")\n",
    "state_ro_data = pd.merge(state_ro_data, state_name_code_map, left_on=\"region\", right_on=\"Code\", how=\"inner\" )\n",
    "\n",
    "state_ro_data[\"date\"] = pd.to_datetime(state_ro_data[\"date\"])\n",
    "state_ro_data[\"R_0\"] = state_ro_data[\"mean\"]\n",
    "\n",
    "### doing left join, as for some states the estimates start in the month of February.\n",
    "all_data = pd.merge(all_data, state_ro_data[[\"date\", \"R_0\", \"State\"]], left_on=[\"date\", \"state\"], right_on=[\"date\", \"State\"], how=\"left\")\n",
    "\n",
    "### Add oxford state level data\n",
    "oxford_subnational_data = pd.read_csv(\"https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv\")\n",
    "oxford_us_data = oxford_subnational_data[oxford_subnational_data[\"CountryCode\"]==\"USA\"]\n",
    "oxford_us_data = oxford_us_data[oxford_us_data[\"Jurisdiction\"] == \"STATE_TOTAL\"]\n",
    "\n",
    "all_columns = oxford_us_data.columns\n",
    "\n",
    "columns_to_remove=[\"CountryName\", \"CountryCode\", \"RegionCode\", \"Jurisdiction\", \"ConfirmedCases\",\"ConfirmedDeaths\" ]\n",
    "remaining_columns = list(set(all_columns) - set(columns_to_remove))\n",
    "\n",
    "oxford_us_data = oxford_us_data[remaining_columns].fillna(-1)\n",
    "\n",
    "oxford_us_data[\"Date\"] = pd.to_datetime(oxford_us_data.Date.apply(str))\n",
    "\n",
    "index_features = []\n",
    "for c in oxford_us_data.columns:\n",
    "    if \"Index\" in c:\n",
    "        index_features.append(c)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "oxford_features = list(set(list(oxford_us_data.columns)) - set([\"RegionName\", \"Date\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "dummy_features = list(set(oxford_features) - set(index_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "oxford_data_with_dummies = pd.concat([oxford_us_data[[\"Date\", \"RegionName\",\"H4_Emergency investment in healthcare\"] + index_features], pd.get_dummies(oxford_us_data[dummy_features].apply(lambda x: x.astype(object), axis = 0))], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = pd.merge(all_data, oxford_data_with_dummies, left_on=[\"date\", \"state\"], right_on=[\"Date\", \"RegionName\"], how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "dummy_features_with_oxford = list(set(oxford_data_with_dummies.columns) - set([\"RegionName\", \"Date\"]) - set(index_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "mobility_features=[\n",
    "               \"retail_and_recreation_percent_change_from_baseline\",\n",
    "           \"grocery_and_pharmacy_percent_change_from_baseline\",\n",
    "           \"parks_percent_change_from_baseline\",\n",
    "           \"transit_stations_percent_change_from_baseline\",\n",
    "           \"workplaces_percent_change_from_baseline\",\n",
    "           \"residential_percent_change_from_baseline\",\n",
    "            \"cases_per_pop\",\n",
    "            \"deaths_per_pop\"\n",
    ",        \"R_0\"\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_features = mobility_features + index_features "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly_data = all_data[numeric_features + dummy_features_with_oxford + [\"fips\", \"date\"]].groupby(\"fips\").resample('W-Wed', label=\"right\", closed=\"right\", on=\"date\").mean().sort_values(by=\"date\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly_data = weekly_data.reset_index()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "burbio_weekly_data = pd.read_csv(\"./data/burbio/burbio_weekly_aggregated_data.csv\")\n",
    "burbio_weekly_data[\"date\"] = pd.to_datetime(burbio_weekly_data.dt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly_data = pd.merge(weekly_data, burbio_weekly_data, on=[\"fips\", \"date\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "burbuio_features = [u'gr 6-8_H', u'gr 6-8_T', u'gr 6-8_V', u'gr 6-8_Vac', u'gr 9-12_H',\n",
    "       u'gr 9-12_T', u'gr 9-12_V', u'gr 9-12_Vac', u'k-5_H', u'k-5_T',\n",
    "       u'k-5_V', u'k-5_Vac']\n",
    "\n",
    "for i in range(1, 5):\n",
    "    weekly_data[\"date\" + \"_minus_{}_weeks\".format(i)] = weekly_data.date.apply(partial(add_days, days = i*-7))\n",
    "    #all_data[\"date\" + \"_minus_{}\".format(i)] = all_data.date.apply(partial(add_days, days = i*-7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_features = mobility_features + index_features + burbuio_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(1, 5):\n",
    "    weekly_data = pd.merge(weekly_data, weekly_data[[\"date\", \"fips\"] + numeric_features], left_on=[\"date_minus_{}_weeks\".format(i), \"fips\"], \n",
    "         right_on=[\"date\", \"fips\"], suffixes=(\"\", \"_prior_week_{}\".format(i)), how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "data_with_ethnicity  = pd.read_csv(\"./data/county_demo/county_data_with_normalized_ethnicity.csv\")\n",
    "with open('./data/county_demo/variable_name_description.pickle', 'rb') as handle:\n",
    "    var_names_descr = pickle.load(handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly_data = pd.merge(weekly_data,data_with_ethnicity, left_on=\"fips\", right_on=\"FIPS\",  how = \"inner\" )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = burbuio_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "for c in weekly_data.columns:\n",
    "    if \"percent_change_from_baseline_\" in c:\n",
    "        features.append(c)\n",
    "    if \"prior_week\" in c and \"date\" not in c and \"R_0\" not in c:\n",
    "        #continue\n",
    "        features.append(c)\n",
    "#     if \"Var\" in c:\n",
    "#         features.append(c)\n",
    "        \n",
    "features = list(set(features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = features + list(var_names_descr.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "cases_per_pop_features = []\n",
    "virtual_features = [\"k-5_V\",\"gr 6-8_V\",\"gr 9-12_V\"]\n",
    "for f in features:\n",
    "    if \"_Vac\" in f:\n",
    "        if \"k-5_Vac\" not in f:\n",
    "            continue\n",
    "    if \"_V\" in f and \"_Vac\" not in f:\n",
    "        continue\n",
    "    if \"Index\" in f:\n",
    "        continue\n",
    "    if \"_prior_week_\" in f:\n",
    "        if \"_prior_week_2\" in f:\n",
    "            cases_per_pop_features.append(f)\n",
    "    else:\n",
    "        cases_per_pop_features.append(f)\n",
    "        \n",
    "for f in dummy_features_with_oxford:\n",
    "    if \"C2_Workplace closing\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C2_Flag\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C3_Cancel public events\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C3_Flag\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C4_Restrictions on gatherings\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C4_Flag\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C5_Close public transport\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C5_Flag\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C6_Stay at home requirements\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C6_Flag\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C7_Restrictions on internal movement\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C7_Flag\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"C8_International travel controls\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "\n",
    "    if \"H1_Public information campaigns\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"H2_Testing policy\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"H3_Contact tracing\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"H6_Flag\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "    if \"H6_Facial Coverings\" in f:\n",
    "        cases_per_pop_features.append(f)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "cases_per_pop_features.append(\"H4_Emergency investment in healthcare_scaled\")\n",
    "#cases_per_pop_features.append(\"H5_Investment in vaccines\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "epsilon=.0001\n",
    "weekly_data[\"cases_per_pop_log\"] = np.log2(weekly_data[\"cases_per_pop\"]+epsilon)\n",
    "weekly_data[\"deaths_per_pop_log\"] = np.log2(weekly_data[\"deaths_per_pop\"]+epsilon)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "scaler = preprocessing.StandardScaler().fit(weekly_data[[\"H4_Emergency investment in healthcare\"]])\n",
    "x_scaled = scaler.transform(weekly_data[[\"H4_Emergency investment in healthcare\"]])\n",
    "weekly_data[\"H4_Emergency investment in healthcare_scaled\"] = pd.DataFrame(x_scaled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly_data.to_csv(\"./data/weekly_data_transformed_feb_22.csv\".format(**locals()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "d  = pd.DataFrame.from_dict(var_names_descr, orient=\"index\").reset_index()\n",
    "d.columns = [\"variable\", \"description\"]\n",
    "d.to_csv(\"./data/county_demo/demographic_data_dictionary.csv\".format(**locals()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
